Shareware Grab Bag

home *** CD-ROM | disk | FTP | other *** search

/ Shareware Grab Bag / Shareware Grab Bag.iso / 020 / arcsourc.arc / ARCLZW.MAC < prev next >

Wrap

Text File | 1985-07-02 | 15.5 KB | 414 lines

/* ARC - Archive utility - ARCLZW $define(tag,$$segment(@1,$$index(@1,=)+1))# $define(version,Version $tag( TED_VERSION DB =1.39), created on $tag( TED_DATE DB =07/02/85) at $tag( TED_TIME DB =10:36:00))# $undefine(tag)# $version (C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED By: Thom Henderson Description: This file contains the routines used to implement Lempel-Zev data compression, which calls for building a coding table on the fly. This form of compression is especially good for encoding files which contain repeated strings, and can often give dramatic improvements over traditional Huffman SQueezing. Language: Computer Innovations Optimizing C86 Programming notes: In this section I am drawing heavily on the LZWCOM and LZWUNC programs by Kent Williams of Cedar Rapids Iowa. This section is mainly his code, and I don't pretend to understand everything that's going on, so please refer any technical questions to him. As best as I can tell, this method works by tracing down a hash table of code strings where each entry has the property: if <string> <char> is in the table then <string> is in the table. */ #include <stdio.h> long lzwlen; /* length after crunching */ #define FALSE 0 #define TRUE !FALSE #define TABSIZE 4096 #define NO_PRED 0xFFFF #define EMPTY 0xFFFF #define NOT_FND 0xFFFF static unsigned int inbuf; /* partial input code storage */ static unsigned int outbuf; /* partial output code storage */ static unsigned char stack[TABSIZE]; /* local push and pop stack */ static int sp; /* current stack pointer */ static struct entry /* string table entry format */ { char used; /* true when this entry is in use */ unsigned int next; /* ptr to next in collision list */ unsigned int predecessor; /* code for preceeding string */ unsigned char follower; /* char following string */ } string_tab[TABSIZE]; /* the code string table */ /* The h() routine is used to calculate a hash value, using the "mid-square" algorithm. That is, a primitive hash value is calculated by taking the middle twelve bits of the square of a key. */ static unsigned h(pred,foll) /* calculate basic hash value */ unsigned int pred; /* code for preceeding string */ unsigned char foll; /* value of following char */ { long local; /* local hash value */ local = (pred + foll) | 0x0800; /* create the hash key */ local *= local; /* square it */ return (local >> 6) & 0x0FFF; /* return the middle 12 bits */ } /* The eolist() function is used to trace down a list of entries with duplicate keys until the last duplicate is found. */ static unsigned eolist(index) /* find last duplicate */ unsigned int index; { int temp; while(temp=string_tab[index].next) /* while more duplicates */ index = temp; return index; } /* The hash() routine is used to find a spot in the hash table for a new entry. It performs a "hash and linear probe" lookup, using h() to calculate the starting hash value and eolist() to perform the linear probe. This routine DOES NOT detect a table full condition. That MUST be checked for elsewhere. */ static unsigned hash(pred,foll) /* find spot in the string table */ unsigned int pred; /* code for preceeding string */ unsigned char foll; /* char following string */ { unsigned int local, tempnext; /* scratch storage */ struct entry *ep; /* allows faster table handling */ local = h(pred,foll); /* get initial hash value */ if(!string_tab[local].used) /* if that spot is free */ return local; /* then that's all we need */ else /* else a collision has occured */ { local = eolist(local); /* move to last duplicate */ /* We must find an empty spot. We start looking 101 places down the table from the last duplicate. */ tempnext = (local+101) & 0x0FFF; ep = &string_tab[tempnext]; /* initialize pointer */ while(ep->used) /* while empty spot not found */ { if(++tempnext==TABSIZE) /* if we are at the end */ { tempnext = 0; /* wrap to beginning of table*/ ep = string_tab; } else ++ep; /* point to next element in table */ } /* local still has the pointer to the last duplicate, while tempnext has the pointer to the spot we found. We use this to maintain the chain of pointers to duplicates. */ string_tab[local].next = tempnext; return tempnext; } } /* The unhash() function is used to search the hash table for a given key. Like hash(), it performs a hash and linear probe search. It returns either the number of the entry (if found) or NOT_FND (if not found). */ static unsigned unhash(pred,foll) /* search string table for a key */ unsigned int pred; /* code of preceeding string */ unsigned char foll; /* character following string */ { unsigned int local, offset; /* scratch storage */ struct entry *ep; /* this speeds up access */ local = h(pred,foll); /* initial hash */ while(1) { ep = &string_tab[local]; /* speed up table access */ if((ep->predecessor==pred) && (ep->follower==foll)) return local; /* we have a match */ if(!ep->next) /* if no more duplicates */ return NOT_FND; /* then key is not listed */ local = ep->next; /* move on to next duplicate */ } } /* The init_tab() routine is used to initialize our hash table. You realize, of course, that "initialize" is a complete misnomer. */ static init_tab() /* set ground state in hash table */ { unsigned int i; /* table index */ setmem((char *)string_tab,sizeof(string_tab),0); for(i=0; i<256; i++) /* list all single byte strings */ upd_tab(NO_PRED,i); inbuf = outbuf = EMPTY; /* nothing is in our buffers */ } /* The upd_tab routine is used to add a new entry to the string table. As previously stated, no checks are made to ensure that the table has any room. This must be done elsewhere. */ upd_tab(pred,foll) /* add an entry to the table */ unsigned int pred; /* code for preceeding string */ unsigned int foll; /* character which follows string */ { struct entry *ep; /* pointer to current entry */ /* calculate offset just once */ ep = &string_tab[hash(pred,foll)]; ep->used = TRUE; /* this spot is now in use */ ep->next = 0; /* no duplicates after this yet */ ep->predecessor = pred; /* note code of preceeding string */ ep->follower = foll; /* note char after string */ } /* This algorithm encodes a file into twelve bit strings (three nybbles). The getcode() and putcode() routines are used to output these strings a byte (or two) at a time. */ static getcode(fd) /* read in a twelve bit code */ FILE *fd; /* file to get code from */ { unsigned int localbuf, returnval; if(inbuf==EMPTY) /* if on a code boundary */ { if((localbuf=getc_unp(fd))==EOF) /* get start of next code */ return EOF; /* pass back end of file status */ localbuf &= 0xFF; /* mask down to true byte value */ if((inbuf=getc_unp(fd))==EOF) /* get end of code, start of next */ return EOF; /* this should never happen */ inbuf &= 0xFF; /* mask down to true byte value */ returnval = ((localbuf<<4)&0xFF0) + ((inbuf>>4)&0x00F); inbuf &= 0x000F; /* leave partial code pending */ } else /* buffer contains first nybble */ { if((localbuf=getc_unp(fd))==EOF) return EOF; localbuf &= 0xFF; returnval = localbuf + ((inbuf<<8)&0xF00); inbuf = EMPTY; /* note no hanging nybbles */ } return returnval; /* pass back assembled code */ } static putcode(fd,code) /* output a twelve bit code */ FILE *fd; /* file to send code to */ unsigned int code; /* code to send */ { int localbuf; if(outbuf==EMPTY) /* if no hanging nybbles */ { putch(((code>>4)&0xFF),fd); /* then output first two nybbles */ outbuf = code & 0x000F; /* and save third for later */ } else /* else dump pending nybble */ { putch((((outbuf<<4)&0xFF0) + ((code>>8)&0x00F)),fd); putch((code&0x00FF),fd); /* last two nybbles can go also */ outbuf = EMPTY; /* and no nybbles left hanging */ } } /* The putch() routine is used to write bytes to the output file, as certain checks must be performed. */ static putch(c,fp) /* output a byte value */ char c; /* the byte value to put */ FILE *fp; /* the file to output to */ { lzwlen++; /* bump our length counter */ if(fp) /* if output file is real */ if(fputc(c,fp)==EOF) /* then make sure this works */ abort("Write fail (disk full?)"); } /* The flushout() routine must be called to flush out any partial code which gets left behind. */ static flushout(fd) /* flush out the final code */ FILE *fd; /* file to send code to */ { if(outbuf!=EMPTY) /* dump any hanging nybble */ putch(((outbuf<<4)&0xFF0),fd); } static push(c) /* push char onto stack */ int c; /* character to push */ { stack[sp] = ((char) c); /* coerce integer into a char */ if(++sp >= TABSIZE) abort("Stack overflow\n"); } static int pop() /* pop character from stack */ { if(sp>0) return ((int) stack[--sp]); /* leave ptr at next empty slot */ else return EMPTY; } /***** LEMPEL-ZEV DECOMPRESSION *****/ static int code_count; /* needed to detect table full */ static unsigned code; /* where we are so far */ static int firstc; /* true only on first character */ init_ucr() /* get set for uncrunching */ { sp = 0; /* clear out the stack */ init_tab(); /* set up atomic code definitions */ code_count = TABSIZE - 256; /* note space left in table */ firstc = 1; /* true only on first code */ } int getc_ucr(f) /* get next uncrunched byte */ FILE *f; /* file containing crunched data */ { unsigned int c; /* a character of input */ int code, newcode; static int oldcode, finchar; struct entry *ep; /* allows faster table handling */ if(firstc) /* first code is always known */ { firstc = FALSE; /* but next will not be first */ oldcode = getcode(f); return finchar = string_tab[oldcode].follower; } if(!sp) /* if stack is empty */ { if((code=newcode=getcode(f))==EOF) return EOF; ep = &string_tab[code]; /* initialize pointer */ if(!ep->used) /* if code isn't known */ { code = oldcode; ep = &string_tab[code]; /* re-initialize pointer */ push(finchar); } while(ep->predecessor!=NO_PRED) { push(ep->follower); /* decode string backwards */ code = ep->predecessor; ep = &string_tab[code]; } push(finchar=ep->follower); /* save first character also */ /* The above loop will terminate, one way or another, with string_tab[code].follower equal to the first character in the string. */ if(code_count) /* if room left in string table */ { upd_tab(oldcode,finchar); --code_count; } oldcode = newcode; } return pop(); /* return saved character */ } /***** LEMPEL-ZEV COMPRESSION *****/ init_cr() /* initialize for Lempel-Zev pack */ { init_tab(); /* set string table ground state */ code_count = TABSIZE - 256; /* set initial table space */ firstc = 1; /* set up special case */ lzwlen = 0; /* reset length counter */ } putc_cr(c,t) /* output a byte crunched */ char c; /* character to crunch */ FILE *t; /* file to write to */ { unsigned localcode; if(firstc) /* special case for first char */ { code = unhash(NO_PRED,c); /* set initial code for table */ firstc = 0; /* no longer first char */ return; } if((localcode=unhash(code,c))!=NOT_FND) { code = localcode; return; } /* When the above clause comes false, we have found the code for the longest known code string. */ putcode(t,code); /* save code for string thus far */ if(code_count) /* if more room in string table */ { upd_tab(code,c); /* then add the new string */ --code_count; } /* Now we start the loop again, starting with the character that didn't fit in the last string. */ code = unhash(NO_PRED,c); } fini_cr(t) /* finish up after crunching */ FILE *t; /* file to dump to */ { putcode(t,code); /* always one unsent code left */ flushout(t); /* make sure everything's written */ }